# Necessary libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
##
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Cargando paquete requerido: lattice
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.4.3
##
## Adjuntando el paquete: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Adjuntando el paquete: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(mclust)
## Warning: package 'mclust' was built under R version 4.4.3
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.4.3
library(aricode)
## Warning: package 'aricode' was built under R version 4.4.3
Basketball Reference’s WNBA per-100-possession data for the seasons 1997 to 2024
| Stat | Description |
|---|---|
| Player | Player Name |
| Team | Team |
| Pos | Position |
| G | Games |
| MP | Minutes Played |
| GS | Games Started |
| FG | Field Goals (includes both 2-point and 3-point field goals) |
| FGA | Field Goal Attempts (includes both 2-point and 3-point attempts) |
| FG% | Field Goal Percentage; formula: FG / FGA. |
| 3P | 3-Point Field Goals |
| 3P% | 3-Point Field Goal Percentage; the formula is 3P / 3PA. |
| 3PA | 3-Point Field Goal Attempts |
| 2P | 2-Point Field Goals |
| 2PA | 2-Point Field Goal Attempts |
| 2P% | 2-Point Field Goal Percentage; the formula is 2P / 2PA. |
| FT | Free Throws |
| FT% | Free Throw Percentage; formula: FT / FTA. |
| FTA | Free Throw Attempts |
| ORB | Offensive Rebounds |
| TRB | Total Rebounds |
| AST | Assists |
| STL | Steals |
| BLK | Blocks |
| TOV | Turnovers |
| PF | Personal Fouls |
| PTS | Points |
| Season | Season Start Year |
script_dir <- getwd()
data_directory <- file.path(script_dir, 'data')
file_path <- file.path(data_directory, 'per_100.csv')
data <- read.csv(file_path)
data_shape <- dim(data)
print(paste("Number of rows:", data_shape[1]))
## [1] "Number of rows: 4935"
print(paste("Number of columns:", data_shape[2]))
## [1] "Number of columns: 29"
columns <- colnames(data)
print("Columns:")
## [1] "Columns:"
print(columns)
## [1] "Player" "Team" "Pos" "G" "MP" "G.1" "GS" "MP.1"
## [9] "FG" "FGA" "FG." "X3P" "X3PA" "X3P." "X2P" "X2PA"
## [17] "X2P." "FT" "FTA" "FT." "ORB" "TRB" "AST" "STL"
## [25] "BLK" "TOV" "PF" "PTS" "Season"
unique_players <- length(unique(data$Player))
print(paste("Number of unique players:", unique_players))
## [1] "Number of unique players: 1095"
positions <- unique(data$Pos)
players_per_position <- sapply(positions, function(pos) sum(data$Pos == pos))
print("Players per Position:")
## [1] "Players per Position:"
print(players_per_position)
## F G G-F F-C C F-G C-F
## 1475 2009 230 312 686 110 113
data$Pos <- gsub("G-F", "F-G", data$Pos)
data$Pos <- gsub("C-F", "F-C", data$Pos)
# data$Pos <- substr(data$Pos, 1, 1)
players_per_position <- sapply(positions, function(pos) sum(data$Pos == pos))
print("Players per Position:")
## [1] "Players per Position:"
print(players_per_position)
## F G G-F F-C C F-G C-F
## 1475 2009 0 425 686 340 0
similarity_g <- sum(data[, "G"] == data[, "G.1"])/length(data[, "G"])
similarity_mp <- sum(data[, "MP"] == data[, "MP.1"])/length(data[, "MP"])
if (similarity_g > 0.9) {
print(paste("Columns G and G.1 are", round(similarity_g * 100, 2), "% identical"))
data <- data[, -which(colnames(data) == "G.1")]
}
## [1] "Columns G and G.1 are 100 % identical"
if (similarity_mp > 0.9) {
print(paste("Columns MP and MP.1 are", round(similarity_mp * 100, 2), "% identical"))
data <- data[, -which(colnames(data) == "MP.1")]
}
## [1] "Columns MP and MP.1 are 100 % identical"
columns <- colnames(data)
print("New Columns:")
## [1] "New Columns:"
print(columns)
## [1] "Player" "Team" "Pos" "G" "MP" "GS" "FG" "FGA"
## [9] "FG." "X3P" "X3PA" "X3P." "X2P" "X2PA" "X2P." "FT"
## [17] "FTA" "FT." "ORB" "TRB" "AST" "STL" "BLK" "TOV"
## [25] "PF" "PTS" "Season"
column_types <- sapply(data, class)
print("Column Types:")
## [1] "Column Types:"
print(column_types)
## Player Team Pos G MP GS
## "character" "character" "character" "integer" "integer" "integer"
## FG FGA FG. X3P X3PA X3P.
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## X2P X2PA X2P. FT FTA FT.
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## ORB TRB AST STL BLK TOV
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## PF PTS Season
## "numeric" "numeric" "integer"
data$Season <- as.character(data$Season)
missing_values <- sum(is.na(data))
print(paste(" Total Missing Values:", missing_values))
## [1] " Total Missing Values: 1513"
missing_values_by_column <- colSums(is.na(data))
print("Missing Values by Column:")
## [1] "Missing Values by Column:"
print(missing_values_by_column)
## Player Team Pos G MP GS FG FGA FG. X3P X3PA
## 0 0 0 0 0 0 8 8 63 8 8
## X3P. X2P X2PA X2P. FT FTA FT. ORB TRB AST STL
## 888 8 8 91 8 8 343 8 8 8 8
## BLK TOV PF PTS Season
## 8 8 8 8 0
data_na <- data[is.na(data$FG),]
print("Rows with NA in FG column:")
## [1] "Rows with NA in FG column:"
print(data_na)
## Player Team Pos G MP GS FG FGA FG. X3P X3PA X3P. X2P X2PA X2P.
## 2127 Wanisha Smith DET G 1 0 0 NA NA NA NA NA NA NA NA NA
## 2150 Brittany Wilkins SAS C 1 0 0 NA NA NA NA NA NA NA NA NA
## 2181 Crystal Smith MIN G 1 0 0 NA NA NA NA NA NA NA NA NA
## 3345 Ameryst Alston NYL G 1 0 0 NA NA NA NA NA NA NA NA NA
## 3949 Angel McCoughtry ATL F-G 1 0 1 NA NA NA NA NA NA NA NA NA
## 4054 Emma Cannon LVA F 1 0 0 NA NA NA NA NA NA NA NA NA
## 4276 Angel McCoughtry LVA F-G 1 0 0 NA NA 0 NA NA NA NA NA 0
## 4610 Lauren Cox CON F 1 0 0 NA NA NA NA NA NA NA NA NA
## FT FTA FT. ORB TRB AST STL BLK TOV PF PTS Season
## 2127 NA NA NA NA NA NA NA NA NA NA NA 2008
## 2150 NA NA NA NA NA NA NA NA NA NA NA 2008
## 2181 NA NA NA NA NA NA NA NA NA NA NA 2008
## 3345 NA NA NA NA NA NA NA NA NA NA NA 2016
## 3949 NA NA NA NA NA NA NA NA NA NA NA 2019
## 4054 NA NA NA NA NA NA NA NA NA NA NA 2020
## 4276 NA NA NA NA NA NA NA NA NA NA NA 2021
## 4610 NA NA NA NA NA NA NA NA NA NA NA 2023
We can confidently drop these rows, as it seems it’s simply players that did not play that year. Possibly due to injury or other reasons.
data <- data[!data$MP == 0,]
data_na <- data[is.na(data$FG.),]
print("Rows with NA in FG. column:")
## [1] "Rows with NA in FG. column:"
print(data_na)
## Player Team Pos G MP GS FG FGA FG. X3P X3PA X3P. X2P
## 68 Ryneldi Becenti PHO G 1 8 0 0 0 NA 0 0 NA 0
## 88 Kim Gessig LAS F 1 4 0 0 0 NA 0 0 NA 0
## 299 Kellie Jolly Harper CLE G 1 4 0 0 0 NA 0 0 NA 0
## 304 MerleLynn Lange-Harris PHO F-C 1 3 0 0 0 NA 0 0 NA 0
## 308 Rebecca Lobo* NYL C 1 1 1 0 0 NA 0 0 NA 0
## 587 Beverly Williams IND G 1 3 0 0 0 NA 0 0 NA 0
## 602 Kristen Rasmussen UTA F 1 9 0 0 0 NA 0 0 NA 0
## 603 Angela Aycock MIN G 3 6 0 0 0 NA 0 0 NA 0
## 610 Mactabene Amachree NYL F 2 3 0 0 0 NA 0 0 NA 0
## 648 Keitha Dickerson UTA F 4 6 0 0 0 NA 0 0 NA 0
## 787 Levys Torres MIA C 2 8 0 0 0 NA 0 0 NA 0
## 810 Dana Wynne SAC F 1 3 0 0 0 NA 0 0 NA 0
## 820 Monique Ambers SAC F 2 4 0 0 0 NA 0 0 NA 0
## 947 Shantia Owens CHA F 2 6 0 0 0 NA 0 0 NA 0
## 972 Elena Shakirova CHA C 1 5 0 0 0 NA 0 0 NA 0
## 1038 Katryna Gaither WAS C 1 1 0 0 0 NA 0 0 NA 0
## 1092 Bethany Donaphin NYL F-C 1 2 0 0 0 NA 0 0 NA 0
## 1204 Itoro Umoh-Coleman HOU G 3 6 0 0 0 NA 0 0 NA 0
## 1259 Amisha Carter DET F 2 13 0 0 0 NA 0 0 NA 0
## 1284 Shaunzinski Gortman WAS G 4 24 0 0 0 NA 0 0 NA 0
## 1375 Stacey Thomas DET F 1 1 0 0 0 NA 0 0 NA 0
## 1403 Jae Kingi-Cross DET G 5 15 0 0 0 NA 0 0 NA 0
## 1590 Kiesha Brown HOU G 4 18 0 0 0 NA 0 0 NA 0
## 1621 Jessica Brungo CON F 4 7 0 0 0 NA 0 0 NA 0
## 1714 Irina Osipova DET C 2 12 0 0 0 NA 0 0 NA 0
## 1785 Ambrosia Anderson CON F 1 1 0 0 0 NA 0 0 NA 0
## 1814 Cori Chambers CON G 1 1 0 0 0 NA 0 0 NA 0
## 1878 Teana Miller PHO C 2 5 0 0 0 NA 0 0 NA 0
## 1924 Kim Smith SAC F 3 6 0 0 0 NA 0 0 NA 0
## 2318 Kelly Schumacher DET C 1 10 0 0 0 NA 0 0 NA 0
## 3004 Samantha Prahalis NYL G 3 8 0 0 0 NA 0 0 NA 0
## 3047 Aaryn Ellenberg CHI G 2 1 0 0 0 NA 0 0 NA 0
## 3500 Blake Dietrick SEA G 2 6 0 0 0 NA 0 0 NA 0
## 3677 Chelsea Hopkins NYL G 1 1 0 0 0 NA 0 0 NA 0
## 3683 Sophie Brunner SAS F 1 2 0 0 0 NA 0 0 NA 0
## 3733 Adaora Elonu ATL F 1 1 0 0 0 NA 0 0 NA 0
## 3745 Amber Harris CHI F 1 2 0 0 0 NA 0 0 NA 0
## 3862 Teana Muldrow DAL F 1 3 0 0 0 NA 0 0 NA 0
## 3958 Jaime Nared LVA F-G 1 1 0 0 0 NA 0 0 NA 0
## 4071 Kaela Davis ATL F 2 2 0 0 0 NA 0 0 NA 0
## 4183 Alisia Jenkins IND F 1 2 0 0 0 NA 0 0 NA 0
## 4186 Erica McCall ATL F 1 5 0 0 0 NA 0 0 NA 0
## 4191 Alisia Jenkins CHI F 2 3 0 0 0 NA 0 0 NA 0
## 4237 Aleah Goodman CON G 1 3 0 0 0 NA 0 0 NA 0
## 4246 Linnae Harper MIN G 1 5 0 0 0 NA 0 0 NA 0
## 4248 Mikiah Herbert Harrigan SEA F 1 1 0 0 0 NA 0 0 NA 0
## 4355 Layshia Clarendon NYL G 1 3 0 0 0 NA 0 0 NA 0
## 4360 Joyner Holmes NYL F 1 5 0 0 0 NA 0 0 NA 0
## 4364 Shatori Walker-Kimbrough CON G 1 4 0 0 0 NA 0 0 NA 0
## 4370 Natasha Mack MIN F 1 2 0 0 0 NA 0 0 NA 0
## 4405 Kaila Charles ATL F-G 1 2 0 0 0 NA 0 0 NA 0
## 4477 Raina Perez SEA G 1 2 0 0 0 NA 0 0 NA 0
## 4554 Moriah Jefferson DAL G 1 4 0 0 0 NA 0 0 NA 0
## 4569 Kiana Williams CON G 1 3 0 0 0 NA 0 0 NA 0
## 4639 Khaalia Hillsman CHI C 1 1 0 0 0 NA 0 0 NA 0
## 4872 Taylor Soule MIN F 2 3 0 0 0 NA 0 0 NA 0
## X2PA X2P. FT FTA FT. ORB TRB AST STL BLK TOV PF PTS Season
## 68 0 NA 0.0 0.0 NA 0.0 0.0 0.0 6.7 0.0 6.7 0.0 0.0 1997
## 88 0 NA 0.0 0.0 NA 0.0 13.0 0.0 0.0 0.0 13.0 25.9 0.0 1997
## 299 0 NA 0.0 0.0 NA 0.0 0.0 14.1 0.0 0.0 28.2 0.0 0.0 1999
## 304 0 NA 0.0 0.0 NA 0.0 38.1 0.0 0.0 0.0 0.0 19.1 0.0 1999
## 308 0 NA 0.0 0.0 NA 58.5 58.5 0.0 0.0 0.0 58.5 0.0 0.0 1999
## 587 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 38.2 0.0 2000
## 602 0 NA 0.0 0.0 NA 6.1 12.1 6.1 6.1 0.0 6.1 12.1 0.0 2000
## 603 0 NA 0.0 0.0 NA 0.0 28.7 38.2 0.0 0.0 0.0 9.6 0.0 2000
## 610 0 NA 19.6 39.3 0.50 19.6 19.6 0.0 0.0 19.6 19.6 39.3 19.6 2001
## 648 0 NA 0.0 38.4 0.00 0.0 9.6 9.6 0.0 0.0 0.0 0.0 0.0 2001
## 787 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 15.5 0.0 0.0 2001
## 810 0 NA 38.0 38.0 1.00 19.0 19.0 0.0 0.0 19.0 0.0 38.0 38.0 2001
## 820 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 28.7 0.0 2002
## 947 0 NA 0.0 0.0 NA 0.0 20.3 0.0 0.0 0.0 0.0 0.0 0.0 2002
## 972 0 NA 12.2 24.4 0.50 12.2 12.2 0.0 12.2 0.0 0.0 0.0 12.2 2002
## 1038 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 59.1 0.0 2002
## 1092 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2003
## 1204 0 NA 0.0 0.0 NA 0.0 0.0 10.0 0.0 0.0 10.0 10.0 0.0 2003
## 1259 0 NA 4.3 8.6 0.50 0.0 17.2 0.0 8.6 0.0 12.9 21.5 4.3 2004
## 1284 0 NA 0.0 0.0 NA 0.0 9.6 4.8 0.0 2.4 4.8 0.0 0.0 2004
## 1375 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2004
## 1403 0 NA 11.2 14.9 0.75 0.0 3.7 3.7 0.0 0.0 7.4 3.7 11.2 2004
## 1590 0 NA 0.0 0.0 NA 0.0 3.3 6.6 3.3 0.0 6.6 3.3 0.0 2005
## 1621 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 7.5 0.0 0.0 2006
## 1714 0 NA 8.8 8.8 1.00 4.4 4.4 0.0 0.0 0.0 4.4 4.4 8.8 2006
## 1785 0 NA 0.0 0.0 NA 0.0 104.7 0.0 0.0 0.0 0.0 0.0 0.0 2006
## 1814 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 103.0 0.0 0.0 2007
## 1878 0 NA 0.0 19.1 0.00 0.0 9.5 0.0 0.0 9.5 0.0 9.5 0.0 2007
## 1924 0 NA 0.0 0.0 NA 0.0 17.8 0.0 0.0 0.0 8.9 8.9 0.0 2007
## 2318 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 5.2 0.0 10.4 0.0 2009
## 3004 0 NA 0.0 0.0 NA 0.0 13.1 6.6 6.6 0.0 19.7 6.6 0.0 2013
## 3047 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2014
## 3500 0 NA 17.4 17.4 1.00 0.0 0.0 0.0 0.0 0.0 8.7 8.7 17.4 2016
## 3677 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2017
## 3683 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 26.1 0.0 2017
## 3733 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2018
## 3745 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 25.2 0.0 2018
## 3862 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 16.8 0.0 2018
## 3958 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 49.5 0.0 2019
## 4071 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2020
## 4183 0 NA 0.0 0.0 NA 0.0 25.5 0.0 0.0 0.0 0.0 0.0 0.0 2020
## 4186 0 NA 0.0 0.0 NA 0.0 20.0 0.0 10.0 0.0 0.0 10.0 0.0 2020
## 4191 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2020
## 4237 0 NA 0.0 0.0 NA 0.0 0.0 18.0 0.0 0.0 0.0 18.0 0.0 2021
## 4246 0 NA 0.0 0.0 NA 0.0 10.2 0.0 0.0 0.0 0.0 20.3 0.0 2021
## 4248 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2021
## 4355 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 33.1 16.5 0.0 2021
## 4360 0 NA 0.0 0.0 NA 0.0 0.0 9.9 0.0 0.0 9.9 9.9 0.0 2021
## 4364 0 NA 0.0 0.0 NA 0.0 13.5 0.0 0.0 13.5 13.5 13.5 0.0 2021
## 4370 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2021
## 4405 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 25.2 0.0 2022
## 4477 0 NA 0.0 0.0 NA 0.0 0.0 25.5 0.0 0.0 0.0 0.0 0.0 2022
## 4554 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2022
## 4569 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2022
## 4639 0 NA 0.0 0.0 NA 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2023
## 4872 0 NA 0.0 0.0 NA 0.0 17.2 0.0 0.0 0.0 17.2 0.0 0.0 2024
As we can see, that column is empty when the player did not attempt any field goals that year. Similarly for the 2, 3 points ones and the free throws. We’ll input 0 for these cases.
data$FG.[is.na(data$FG.) & data$FGA == 0] <- 0
data$X3P.[is.na(data$X3P.) & data$X3PA == 0] <- 0
data$X2P.[is.na(data$X2P.) & data$X2PA == 0] <- 0
data$FT.[is.na(data$FT.) & data$FTA == 0] <- 0
missing_values_by_column <- colSums(is.na(data))
print("Final Missing Values by Column:")
## [1] "Final Missing Values by Column:"
print(missing_values_by_column)
## Player Team Pos G MP GS FG FGA FG. X3P X3PA
## 0 0 0 0 0 0 0 0 0 0 0
## X3P. X2P X2PA X2P. FT FTA FT. ORB TRB AST STL
## 0 0 0 0 0 0 0 0 0 0 0
## BLK TOV PF PTS Season
## 0 0 0 0 0
data_shape <- dim(data)
print("Final Shape:")
## [1] "Final Shape:"
print(paste("Rows:", data_shape[1]))
## [1] "Rows: 4927"
print(paste("Columns:", data_shape[2]))
## [1] "Columns: 27"
duplicates <- data[duplicated(data),]
print("Number of Duplicates:")
## [1] "Number of Duplicates:"
print(dim(duplicates)[1])
## [1] 0
plot_distribution <- function(data, column, bins) {
p1 <- ggplot(data, aes_string(x = column)) +
geom_histogram(bins = bins, fill = "#E15101", alpha = 0.7) +
labs(title = paste("Distribution of", column)) +
theme(plot.margin = unit(c(1, 1, 1, 1), "cm"))
p2 <- ggplot(data, aes_string(y = column)) +
geom_boxplot(fill = "#E15101", alpha = 0.7) + coord_flip() +
theme(plot.margin = unit(c(0, 1, 0, 2.1), "cm"),
axis.ticks.y = element_blank(), axis.text.y = element_blank())
grid.arrange(p1, p2, ncol = 1, heights = c(3, 1))
}
for (col in colnames(data)) {
if (is.numeric(data[[col]])) {
bins <- ifelse(grepl("\\.", col), 10, 27)
plot_distribution(data, col, bins)
}
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
correlation_matrix <- cor(data[, sapply(data, is.numeric)], use = "complete.obs")
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
corrplot(correlation_matrix, method = "color", tl.col = "black", tl.srt = 45, tl.cex = 0.7)
As expected, the field goals and each type of shot attempted and made is correlated with its made percentage. And the number of games played is correlated with the games started and minutes played, and the total and offensive rebounds as well. It’s interesting to note there is a slight negative correlation between rebounds and 3-point shooting, a good clue that we can find some pointers of position by the strong starts per player.
high_correlations <- which(abs(correlation_matrix) > 0.8 & abs(correlation_matrix) < 1, arr.ind = TRUE)
if (length(high_correlations) > 0) {
for (i in 1:nrow(high_correlations)) {
row <- high_correlations[i, 1]
col <- high_correlations[i, 2]
cat(sprintf("Correlation between %s and %s: %.2f\n",
rownames(correlation_matrix)[row],
colnames(correlation_matrix)[col],
correlation_matrix[row, col]))
}
} else {
print("No correlations greater than 80% found.")
}
## Correlation between MP and G: 0.80
## Correlation between G and MP: 0.80
## Correlation between GS and MP: 0.89
## Correlation between MP and GS: 0.89
## Correlation between X2P and FG: 0.89
## Correlation between PTS and FG: 0.93
## Correlation between X2P. and FG.: 0.88
## Correlation between X3PA and X3P: 0.85
## Correlation between X3P and X3PA: 0.85
## Correlation between FG and X2P: 0.89
## Correlation between X2PA and X2P: 0.85
## Correlation between X2P and X2PA: 0.85
## Correlation between FG. and X2P.: 0.88
## Correlation between FTA and FT: 0.91
## Correlation between FT and FTA: 0.91
## Correlation between FG and PTS: 0.93
For this project it’s important to differentiate between the different types of shooting, 2-point, 3-point and free throws, and we will keep the offensive and obtain the defensive rebounds, as well as the other defensive stats; and the total points. It’s important to remember these stats are already somewhat normalized, since they are the stats per 100 possessions.
data <- data %>%
mutate(DRB = TRB - ORB)
columns_to_keep <- c("Player", "Team", "Season", "Pos", "X3PA", "X3P.","X2PA", "X2P.", "FTA", "FT.", 'PTS', "ORB", "DRB", "AST", "STL", "BLK", "TOV", "PF")
data <- data %>% select(all_of(columns_to_keep))
columns <- colnames(data)
print("Columns:")
## [1] "Columns:"
print(columns)
## [1] "Player" "Team" "Season" "Pos" "X3PA" "X3P." "X2PA" "X2P."
## [9] "FTA" "FT." "PTS" "ORB" "DRB" "AST" "STL" "BLK"
## [17] "TOV" "PF"
# save data to a csv
# write.csv(data, file = file.path(data_directory, "wnba_cleaned.csv"), row.names = FALSE)
remove_outliers <- function(data) {
numeric_columns <- sapply(data, is.numeric)
for (col in names(data)[numeric_columns]) {
Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
data <- data[data[[col]] >= lower_bound & data[[col]] <= upper_bound, ]
}
return(data)
}
data_no_outliers <- remove_outliers(data)
print("Remaining data points:")
## [1] "Remaining data points:"
dim(data_no_outliers)
## [1] 3457 18
# Perform PCA
numeric_cols <- sapply(data_no_outliers, is.numeric)
pca_result <- prcomp(data_no_outliers[, numeric_cols], scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0166 1.5088 1.2348 0.97423 0.96258 0.91316 0.86266
## Proportion of Variance 0.2905 0.1626 0.1089 0.06779 0.06618 0.05956 0.05316
## Cumulative Proportion 0.2905 0.4531 0.5620 0.62978 0.69597 0.75553 0.80868
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.80937 0.70276 0.67697 0.64214 0.5975 0.52550 0.16044
## Proportion of Variance 0.04679 0.03528 0.03273 0.02945 0.0255 0.01973 0.00184
## Cumulative Proportion 0.85547 0.89075 0.92349 0.95294 0.9784 0.99816 1.00000
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
cumvar_explained <- cumsum(var_explained)
pca_var <- data.frame(
PC = 1:length(var_explained),
var_explained = var_explained,
cum_var = cumvar_explained
)
p1 <- ggplot(pca_var, aes(x = PC, y = var_explained)) +
geom_col(fill = "#E15101", alpha = 0.7) +
geom_line(aes(y = cum_var), color = "#2e86c1", size = 1.5) +
geom_point(aes(y = cum_var), color = "#2e86c1", size = 3) +
geom_text(aes(y = cum_var, label = sprintf("%.2f", cum_var)),
vjust = -0.8,
hjust = 0.5,
size = 7) +
scale_y_continuous(
name = "Proportion of Variance Explained",
sec.axis = sec_axis(~., name = "Cumulative Proportion")
) +
labs(x = "Principal Component") +
theme_minimal() +
theme(
axis.text = element_text(size = 14), # Axis numbers
axis.title = element_text(size = 16), # Axis titles
plot.title = element_text(size = 22), # Plot title
aspect.ratio = 9/16 # Force 16:9 aspect ratio
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
#ggsave("figures/pca_scree.pdf", p1, width = 16, height = 9, units = "in")
# Plot PCA results
data_pca <- as.data.frame(pca_result$x)
data_pca$Player <- data_no_outliers$Player
data_pca$Pos <- data_no_outliers$Pos
ggplot(data_pca,
aes(x = PC1, y = PC2, color = Pos, label = Player)) +
geom_point(alpha = 0.7) +
labs(title = "PCA of WNBA Player Stats", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal()
The two principal components explain 55% of the variance, which is not ideal, but it’s a good start. And we can see some clear zones for some of the positions with the first two, Guards are to the left, Forwards and Centers scattered but mostly right.
# 3D scatter plot
plot_ly(data_pca, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Pos,
type = "scatter3d", mode = "markers", marker = list(size = 3)) %>%
layout(title = "3D PCA of WNBA Player Stats",
scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')))
data_unique <- data_pca
data_unique$Player <- data_pca$Player
data_unique$Pos <- data_pca$Pos
data_unique <- data_unique[!duplicated(data_unique),]
# Perform t-SNE
set.seed(42)
tsne_result <- Rtsne(
data_unique, dims = 3, perplexity = 30, verbose = TRUE, max_iter = 500
)
## Performing PCA
## Read the 3456 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 1.52 seconds (sparsity = 0.037943)!
## Learning embedding...
## Iteration 50: error is 84.151462 (50 iterations in 2.75 seconds)
## Iteration 100: error is 80.952215 (50 iterations in 2.81 seconds)
## Iteration 150: error is 80.910261 (50 iterations in 1.25 seconds)
## Iteration 200: error is 80.910362 (50 iterations in 1.26 seconds)
## Iteration 250: error is 80.910384 (50 iterations in 1.12 seconds)
## Iteration 300: error is 2.659008 (50 iterations in 2.48 seconds)
## Iteration 350: error is 2.245123 (50 iterations in 1.57 seconds)
## Iteration 400: error is 2.076515 (50 iterations in 1.78 seconds)
## Iteration 450: error is 1.981314 (50 iterations in 1.87 seconds)
## Iteration 500: error is 1.921614 (50 iterations in 1.86 seconds)
## Fitting performed in 18.76 seconds.
data_tsne <- as.data.frame(tsne_result$Y)
data_tsne$Player <- data_unique$Player
data_tsne$Pos<- data_unique$Pos
print("t-SNE Data Size:")
## [1] "t-SNE Data Size:"
print(dim(data_tsne))
## [1] 3456 5
# Create a 2D scatter plot
ggplot(data_tsne, aes(x = V1, y = V2, color = Pos, label = Player)) +
geom_point(alpha = 0.7) +
labs(title = "t-SNE of WNBA Player Stats",
x = "Dimension 1", y = "Dimension 2") +
theme_minimal()
# Create a 3D scatter plot
plot_ly(data_tsne, x = ~V1, y = ~V2, z = ~V3, color = ~Pos,
type = "scatter3d", mode = "markers", marker = list(size = 2)) %>%
layout(#title = "t-SNE of WNBA Player Stats",
scene = list(xaxis = list(title = 'Dim 1'),
yaxis = list(title = 'Dim 2'),
zaxis = list(title = 'Dim 3')))
As we can see, chaining these two methods of dimensionality reduction we can see a clearer separation of the positions, The guards occupy a clear region, and the Forwards another, with the centers at the end of the cluster mixed in.
We will attempt to retrieve the positions from the stats, to see if there is such a thing as a style of play corresponding to the labeled positions. We will do so using the original data, and the one reduced and transformed through t-SNE.
calculate_wss <- function(data, k) {
kmeans_result <- kmeans(data, centers = k, nstart = 25)
return(kmeans_result$tot.withinss)
}
cross_validate_kmeans <- function(data, k_values, folds = 5) {
set.seed(42)
fold_indices <- createFolds(data[, 1], k = folds, list = TRUE, returnTrain = TRUE)
wss_values <- sapply(k_values, function(k) {
fold_wss <- sapply(fold_indices, function(indices) {
train_data <- data[indices, ]
calculate_wss(train_data, k)
})
mean(fold_wss)
})
return(wss_values)
}
numeric_cols <- sapply(data, is.numeric)
data1 <- data[, numeric_cols]
data2 <- data_tsne[, 1:3]
k_values <- 1:10
wss_values1 <- cross_validate_kmeans(data1, k_values)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
wss_values2 <- cross_validate_kmeans(data2, k_values)
wss_df1 <- data.frame(k = k_values, wss = wss_values1)
wss_df2 <- data.frame(k = k_values, wss = wss_values2)
plot1 <- ggplot(wss_df1, aes(x = k, y = wss)) +
geom_line(color = "#2e86c1") +
geom_point(color = "#2e86c1") +
labs(title = "Original Data", x = "Number of Clusters (k)", y = "Within-Cluster Sum of Squares (WSS)") +
theme_minimal() +
theme(
axis.text = element_text(size = 16),
axis.title = element_text(size = 18),
plot.title = element_text(size = 24),
aspect.ratio = 9/16
)
plot2 <- ggplot(wss_df2, aes(x = k, y = wss)) +
geom_line(color = "#2e86c1") +
geom_point(color = "#2e86c1") +
labs(title = "Data t-SNE", x = "Number of Clusters (k)", y = "Within-Cluster Sum of Squares (WSS)") +
theme_minimal() +
theme(
axis.text = element_text(size = 16),
axis.title = element_text(size = 18),
plot.title = element_text(size = 24),
aspect.ratio = 9/16
)
combined_plots <- gridExtra::arrangeGrob(plot1, plot2, ncol = 2)
grid.arrange(combined_plots)
grid.arrange(plot1, plot2, ncol = 2)
#ggsave("figures/kmeans_elbow.pdf", combined_plots, width = 16, height = 9, units = "in")
There isn’t a clear winner. But we know our data set has 5 labels, so we will use that as the number of clusters.
# plot k_means_tsn result in 3d
kmeans_result <- kmeans(data1, centers = 5, nstart = 25)
kmeans_result_tsne <- kmeans(data2, centers = 5, nstart = 25)
plot_ly(data_tsne, x = ~V1, y = ~V2, z = ~V3, color = ~as.factor(kmeans_result_tsne$cluster),
type = "scatter3d", mode = "markers", marker = list(size = 3)) %>%
layout(#title = "3D K-Means Clustering of WNBA Player Stats",
scene = list(xaxis = list(title = 'Dim 1'),
yaxis = list(title = 'Dim 2'),
zaxis = list(title = 'Dim 3')))
cross_validate_dbscan <- function(data, eps_values, minPts_values, folds = 5) {
set.seed(42)
fold_indices <- createFolds(data[, 1], k = folds, list = TRUE, returnTrain = TRUE)
results <- expand.grid(eps = eps_values, minPts = minPts_values)
results$silhouette <- NA
for(i in 1:nrow(results)) {
eps <- results$eps[i]
minPts <- results$minPts[i]
fold_silhouette <- sapply(fold_indices, function(indices) {
train_data <- data[indices, ]
dbscan_result <- dbscan(train_data, eps = eps, minPts = minPts)
if(length(unique(dbscan_result$cluster)) > 1 &&
sum(dbscan_result$cluster == 0) < (nrow(train_data) - 1)) {
sil <- silhouette(dbscan_result$cluster, dist(train_data))
mean(sil[, 3])
} else {
return(NA)
}
})
results$silhouette[i] <- mean(fold_silhouette, na.rm = TRUE)
}
return(results)
}
eps_values <- seq(0.1, 1, by = 0.1)
minPts_values <- 3:8
results1 <- cross_validate_dbscan(data1, eps_values, minPts_values)
results2 <- cross_validate_dbscan(data2, eps_values, minPts_values)
# Plot the results
plot1 <- ggplot(results1, aes(x = eps, y = minPts, fill = silhouette)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "#2e86c1", na.value = "grey90") +
labs(title = "Original Data",
x = "Epsilon", y = "MinPoints") +
theme_minimal() +
theme(
axis.text = element_text(size = 16),
axis.title = element_text(size = 18),
plot.title = element_text(size = 24),
#aspect.ratio = 9/16
)
plot2 <- ggplot(results2, aes(x = eps, y = minPts, fill = silhouette)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "#2e86c1", na.value = "grey90") +
labs(title = "t-SNE Data",
x = "Epsilon", y = "MinPoints") +
theme_minimal() +
theme(
axis.text = element_text(size = 16),
axis.title = element_text(size = 18),
plot.title = element_text(size = 24),
#aspect.ratio = 9/16
)
combined_plots <- gridExtra::arrangeGrob(plot1, plot2, ncol = 2)
grid.arrange(combined_plots)
#ggsave("figures/dbscan.pdf", combined_plots, width = 16, height = 9, units = "in")
# Get best parameters
best_params1 <- results1[which.max(results1$silhouette), ]
best_params2 <- results2[which.max(results2$silhouette), ]
cat("Best parameters for original data:\n",
"eps =", best_params1$eps, ", minPts =", best_params1$minPts,
"\nSilhouette score:", best_params1$silhouette, "\n\n")
## Best parameters for original data:
## eps = 0.1 , minPts = 7
## Silhouette score: 0.3890026
cat("Best parameters for t-SNE data:\n",
"eps =", best_params2$eps, ", minPts =", best_params2$minPts,
"\nSilhouette score:", best_params2$silhouette, "\n")
## Best parameters for t-SNE data:
## eps = 1 , minPts = 3
## Silhouette score: 0.1128639
dbscan1 <- dbscan(data1,
eps = best_params1$eps,
minPts = best_params1$minPts)
dbscan2 <- dbscan(data2,
eps = best_params2$eps,
minPts = best_params2$minPts)
plot1 <- ggplot(data1,
aes(x = X3PA, y = AST)) +
geom_point(aes(color = factor(dbscan1$cluster)), alpha = 0.7) +
labs(title = "DBSCAN Clusters (Original Data)",
subtitle = paste("eps =", round(best_params1$eps, 2),
", minPts =", best_params1$minPts,
"\nSilhouette =", round(best_params1$silhouette, 3)),
color = "Cluster") +
theme_minimal()
plot2 <- plot_ly(data.frame(data2),
x = ~V1, y = ~V2, z = ~V3,
color = factor(dbscan2$cluster),
type = "scatter3d",
mode = "markers",
marker = list(size = 3)) %>%
layout(title = paste("DBSCAN Clusters (t-SNE Data)\n",
"eps =", round(best_params2$eps, 2),
", minPts =", best_params2$minPts,
", Silhouette =", round(best_params2$silhouette, 3)))
print(plot1)
print(plot2)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
cat("\nOriginal Data - Number of clusters:", length(unique(dbscan1$cluster)) - 1,
"\nNoise points:", sum(dbscan1$cluster == 0), "\n")
##
## Original Data - Number of clusters: 1
## Noise points: 4915
cat("\nt-SNE Data - Number of clusters:", length(unique(dbscan2$cluster)) - 1,
"\nNoise points:", sum(dbscan2$cluster == 0), "\n")
##
## t-SNE Data - Number of clusters: 472
## Noise points: 825
DBScan is not picking up any differences between the data, putting everything in the same cluster even when we did hyperparameter tuning. We saw initially how there’s no significant spatial separation between the different player positions in our point cloud which are the cases this algorithm thrives in.
cross_validate_hclust <- function(
data, k_values, methods = c("complete", "average", "single", "ward.D2"), folds = 5) {
set.seed(42)
fold_indices <- createFolds(1:nrow(data), k = folds, list = TRUE, returnTrain = TRUE)
results <- expand.grid(k = k_values, method = methods)
results$silhouette <- NA
for(i in 1:nrow(results)) {
k <- results$k[i]
method <- as.character(results$method[i])
fold_silhouette <- sapply(fold_indices, function(indices) {
train_data <- data[indices, ]
dist_matrix <- dist(train_data, method = "euclidean")
hc <- hclust(dist_matrix, method = method)
clusters <- cutree(hc, k = k)
if(length(unique(clusters)) > 1) {
sil <- silhouette(clusters, dist_matrix)
mean(sil[, 3])
} else {
return(NA)
}
})
results$silhouette[i] <- mean(fold_silhouette, na.rm = TRUE)
}
return(results)
}
data1 <- scale(data[, numeric_cols])
data2 <- scale(data_tsne[, 1:3])
k_values <- 2:10
methods <- c("complete", "average", "single", "ward.D2")
results1 <- cross_validate_hclust(data1, k_values, methods)
results2 <- cross_validate_hclust(data2, k_values, methods)
plot1 <- ggplot(results1, aes(x = k, y = silhouette, color = method)) +
geom_line() +
geom_point() +
labs(title = "Original Data",
x = "Number of Clusters (k)",
y = "Silhouette Score") +
theme_minimal() +
theme(
axis.text = element_text(size = 18),
axis.title = element_text(size = 20),
plot.title = element_text(size = 26),
#aspect.ratio = 9/16
)
plot2 <- ggplot(results2, aes(x = k, y = silhouette, color = method)) +
geom_line() +
geom_point() +
labs(title = "t-SNE Data",
x = "Number of Clusters (k)",
y = "Silhouette Score") +
theme_minimal() +
theme(
axis.text = element_text(size = 18),
axis.title = element_text(size = 20),
plot.title = element_text(size = 26),
#aspect.ratio = 9/16
)
combined_plots <- gridExtra::arrangeGrob(plot1, plot2, ncol = 2)
grid.arrange(combined_plots)
#ggsave("figures/hierarchical_cv.pdf", combined_plots, width = 16, height = 9, units = "in")
# Get best parameters
best_params1 <- results1[which.max(results1$silhouette), ]
best_params2 <- results2[which.max(results2$silhouette), ]
cat("Best parameters for original data:\n",
"k =", best_params1$k, ", method =", best_params1$method,
"\nSilhouette score:", best_params1$silhouette, "\n\n")
## Best parameters for original data:
## k = 2 , method = 3
## Silhouette score: 0.8724845
cat("Best parameters for t-SNE data:\n",
"k =", best_params2$k, ", method =", best_params2$method,
"\nSilhouette score:", best_params2$silhouette, "\n")
## Best parameters for t-SNE data:
## k = 4 , method = 2
## Silhouette score: 0.3005942
dist_matrix1 <- dist(data1, method = "euclidean")
hc1 <- hclust(dist_matrix1, method = best_params1$method)
clusters1 <- cutree(hc1, k = best_params1$k)
dist_matrix2 <- dist(data2, method = "euclidean")
hc2 <- hclust(dist_matrix2, method = best_params2$method)
clusters2 <- cutree(hc2, k = best_params2$k)
plot1 <- ggplot(data,
aes(x = X3PA, y = AST)) +
geom_point(aes(color = factor(clusters1)), alpha = 0.7) +
labs(title = "Hierarchical Clusters (Original Data)",
subtitle = paste("k =", best_params1$k,
", method =", best_params1$method,
"\nSilhouette =", round(best_params1$silhouette, 3)),
color = "Cluster") +
theme_minimal()
plot2 <- plot_ly(data.frame(data_tsne),
x = ~V1, y = ~V2, z = ~V3,
color = factor(clusters2),
type = "scatter3d",
mode = "markers",
marker = list(size = 3)) %>%
layout(title = paste("Hierarchical Clusters (t-SNE Data)\n",
"k =", best_params2$k,
", method =", best_params2$method,
", Silhouette =", round(best_params2$silhouette, 3)))
print(plot1)
print(plot2)
cat("\nOriginal Data - Number of clusters:", length(unique(clusters1)),
"\nCluster sizes:", table(clusters1), "\n")
##
## Original Data - Number of clusters: 2
## Cluster sizes: 4926 1
cat("\nt-SNE Data - Number of clusters:", length(unique(clusters2)),
"\nCluster sizes:", table(clusters2), "\n")
##
## t-SNE Data - Number of clusters: 4
## Cluster sizes: 946 728 1059 723
par(mfrow = c(1, 2))
plot(hc1,
main = paste("Dendrogram (Original Data)\nMethod:", best_params1$method),
xlab = "",
sub = "",
cex = 0.6,
hang = -1)
rect.hclust(hc1, k = best_params1$k, border = "red")
plot(hc2,
main = paste("Dendrogram (t-SNE Data)\nMethod:", best_params2$method),
xlab = "",
sub = "",
cex = 0.6,
hang = -1)
rect.hclust(hc2, k = best_params2$k, border = "red")
par(mfrow = c(1, 1))
par(mar = c(8, 4, 4, 2))
plot(hc2,
#main = paste("t-SNE Data)\nMethod:", best_params2$method),
xlab = "",
sub = "",
cex = 0.6,
hang = -1,
axes = FALSE,
labels = FALSE
)
rect.hclust(hc2, k = best_params2$k, border = "#2e86c1")
labels <- paste0(data_tsne$Player, "\n", data_tsne$Pos)[hc2$order]
n_labels <- 16
indices <- round(seq(1, length(hc2$order), length.out = n_labels))
axis(1, at = indices,
labels = labels[indices],
las = 2,
srt = 70,
cex.axis = 0.8)
# Add y-axis
axis(2)
# pdf("figures/dendrogram.pdf", width = 16, height = 9)
# par(mar = c(12, 4, 4, 2)) # Increase bottom margin for labels
#
# plot(hc2,
# main = "",
# xlab = "",
# sub = "",
# cex = 0.8, # Base text size
# cex.main = 2, # Title size
# cex.axis = 1.2, # Axis text size
# cex.lab = 1.4, # Axis labels size
# hang = -1,
# axes = FALSE,
# labels = FALSE
# )
# rect.hclust(hc2, k = best_params2$k, border = "#2e86c1")
#
# # Add labels with increased size
# labels <- paste0(data_tsne$Player, "\n", data_tsne$Pos)[hc2$order]
# n_labels <- 16
# indices <- round(seq(1, length(hc2$order), length.out = n_labels))
# axis(1,
# at = indices,
# labels = labels[indices],
# las = 2,
# cex.axis = 1.2, # Increase label size
# padj = 0.5) # Adjust label position
# # Add y-axis
# axis(2, cex.axis = 1.2) # Increase y-axis label size
#
# dev.off()
cross_validate_gmm <- function(data, G_values, folds = 5) {
set.seed(42)
fold_indices <- createFolds(1:nrow(data), k = folds, list = TRUE, returnTrain = TRUE)
results <- data.frame(G = G_values, bic = NA)
for(i in seq_along(G_values)) {
G <- G_values[i]
# Calculate BIC scores across folds
fold_bic <- sapply(fold_indices, function(indices) {
train_data <- data[indices, ]
gmm <- Mclust(train_data, G = G)
return(gmm$bic)
})
results$bic[i] <- mean(fold_bic)
}
return(results)
}
data1 <- scale(data[, numeric_cols])
data2 <- scale(data_tsne[, 1:3])
G_values <- 1:10
results1 <- cross_validate_gmm(data1, G_values)
results2 <- cross_validate_gmm(data2, G_values)
plot1 <- ggplot(results1, aes(x = G, y = bic)) +
geom_line() +
geom_point() +
labs(title = "GMM Model Selection\n(Original Data)",
x = "Number of Components",
y = "BIC Score") +
theme_minimal()
plot2 <- ggplot(results2, aes(x = G, y = bic)) +
geom_line() +
geom_point() +
labs(title = "GMM Model Selection\n(t-SNE Data)",
x = "Number of Components",
y = "BIC Score") +
theme_minimal()
grid.arrange(plot1, plot2, ncol = 2)
best_G1 <- G_values[which.min(results1$bic)]
best_G2 <- G_values[which.min(results2$bic)]
# Plot results with improved styling
# plot1 <- ggplot(results1, aes(x = G, y = bic)) +
# geom_line(color = "#2e86c1", size = 1.5) +
# geom_point(color = "#2e86c1", size = 3) +
# labs(title = "Original Data",
# x = "Number of Components",
# y = "BIC Score") +
# theme_minimal() +
# scale_y_continuous(labels = function(x) format(x/1000, scientific = FALSE, big.mark = ",", suffix = "k")) +
# theme(
# axis.text = element_text(size = 16), # Axis numbers
# axis.title = element_text(size = 18), # Axis titles
# plot.title = element_text(size = 24), # Plot title
# aspect.ratio = 9/16 # Force 16:9 aspect ratio
# )
#
# plot2 <- ggplot(results2, aes(x = G, y = bic)) +
# geom_line(color = "#2e86c1", size = 1.5) +
# geom_point(color = "#2e86c1", size = 3) +
# labs(title = "t-SNE Data",
# x = "Number of Components",
# y = "BIC Score") +
# theme_minimal() +
# scale_y_continuous(labels = function(x) format(x/1000, scientific = FALSE, big.mark = ",", suffix = "k")) +
# theme(
# axis.text = element_text(size = 16),
# axis.title = element_text(size = 18),
# plot.title = element_text(size = 24),
# aspect.ratio = 9/16
# )
#
# # Combine plots and save
# combined_plots <- gridExtra::arrangeGrob(plot1, plot2, ncol = 2)
# grid.arrange(combined_plots)
# ggsave("figures/gmm_cv.pdf", combined_plots, width = 16, height = 6, units = "in")
gmm1 <- Mclust(data1, G = best_G1)
gmm2 <- Mclust(data2, G = best_G2)
plot1 <- ggplot(data,
aes(x = X3PA, y = X2P.)) +
geom_point(aes(color = factor(gmm1$classification)), alpha = 0.7) +
labs(title = "GMM Clusters (Original Data)",
subtitle = paste("Components =", best_G1,
"\nBIC =", round(min(results1$bic), 2)),
color = "Cluster") +
theme_minimal()
plot2 <- plot_ly(data.frame(data_tsne),
x = ~V1, y = ~V2, z = ~V3,
color = factor(gmm2$classification),
type = "scatter3d",
mode = "markers",
marker = list(size = 3)) %>%
layout(title = paste("GMM Clusters (t-SNE Data)\n",
"Components =", best_G2,
"\nBIC =", round(min(results2$bic), 2)))
print(plot1)
print(plot2)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
cat("\nOriginal Data - Number of components:", best_G1,
"\nCluster sizes:", table(gmm1$classification), "\n")
##
## Original Data - Number of components: 1
## Cluster sizes: 4927
cat("\nt-SNE Data - Number of components:", best_G2,
"\nCluster sizes:", table(gmm2$classification), "\n")
##
## t-SNE Data - Number of components: 1
## Cluster sizes: 3456
true_labels <- as.numeric(as.factor(data$Pos))
true_labels_tsne <- as.numeric(as.factor(data_tsne$Pos))
homogeneity_kmeans1 <- NMI(true_labels, kmeans_result$cluster)
homogeneity_dbscan1 <- NMI(true_labels, dbscan1$cluster)
homogeneity_hclust1 <- NMI(true_labels, clusters1)
homogeneity_gmm1 <- NMI(true_labels, gmm1$classification)
homogeneity_kmeans2 <- NMI(true_labels_tsne, kmeans_result_tsne$cluster)
homogeneity_dbscan2 <- NMI(true_labels_tsne, dbscan2$cluster)
homogeneity_hclust2 <- NMI(true_labels_tsne, clusters2)
homogeneity_gmm2 <- NMI(true_labels_tsne, gmm2$classification)
results_df <- data.frame(
Method = c("K-means", "DBSCAN", "Hierarchical", "GMM"),
Original_Data = c(homogeneity_kmeans1, homogeneity_dbscan1,
homogeneity_hclust1, homogeneity_gmm1),
TSNE_Data = c(homogeneity_kmeans2, homogeneity_dbscan2,
homogeneity_hclust2, homogeneity_gmm2)
)
print("Homogeneity Scores:")
## [1] "Homogeneity Scores:"
print(results_df)
## Method Original_Data TSNE_Data
## 1 K-means 0.1138672811 0.3483866
## 2 DBSCAN 0.0002533170 0.1754827
## 3 Hierarchical 0.0001306353 0.2187598
## 4 GMM 0.0000000000 0.0000000
results_long <- tidyr::pivot_longer(results_df,
cols = c("Original_Data", "TSNE_Data"),
names_to = "Dataset",
values_to = "Score")
plot <- ggplot(results_long, aes(x = Method, y = Score, fill = Dataset)) +
geom_bar(stat = "identity", position = "dodge") +
labs(#title = "Clustering Methods Comparison",
y = "Homogeneity Score",
x = "Clustering Method") +
theme_minimal() +
theme(
axis.text = element_text(size = 16),
axis.title = element_text(size = 18),
plot.title = element_text(size = 24),
aspect.ratio = 9/16
)
#theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plot)
#ggsave("figures/homogeneity_scores.pdf", plot, width = 16, height = 9, units = "in")
data_tsne$cluster <- kmeans_result_tsne$cluster
cluster_means <- data_tsne %>%
group_by(cluster) %>%
summarize(across(c("V1", "V2", "V3"), mean)) %>%
ungroup()
cluster_matrix <- as.matrix(cluster_means[, -1])
rownames(cluster_matrix) <- paste("Cluster", cluster_means$cluster)
scaled_matrix <- scale(cluster_matrix)
pheatmap(scaled_matrix,
main = "Cluster Profiles - K-means on t-SNE Data",
angle_col = 45,
display_numbers = TRUE,
number_format = "%.1f",
fontsize_number = 7,
cluster_rows = FALSE, # Don't cluster the rows
cluster_cols = TRUE) # Allow clustering of features
data$cluster <- kmeans_result$cluster
cluster_means <- data %>%
group_by(cluster) %>%
summarize(across(head(names(numeric_cols)[numeric_cols], -1), mean)) %>%
ungroup()
cluster_matrix <- as.matrix(cluster_means[, -1])
rownames(cluster_matrix) <- paste("Cluster", cluster_means$cluster)
scaled_matrix <- scale(cluster_matrix)
pheatmap(scaled_matrix,
main = "",
angle_col = 45,
display_numbers = TRUE,
number_format = "%.1f",
fontsize_number = 8,
cluster_rows = FALSE,
cluster_cols = TRUE,
legend = FALSE,
#filename = "figures/final_heatmap.pdf"
)
# For t-SNE data k-means clusters
cluster_positions <- data_tsne %>%
group_by(cluster) %>%
count(Pos) %>%
arrange(cluster, desc(n)) %>%
group_by(cluster) %>%
slice(1) %>%
ungroup()
print("Majority positions in t-SNE k-means clusters:")
## [1] "Majority positions in t-SNE k-means clusters:"
print(cluster_positions)
## # A tibble: 5 × 3
## cluster Pos n
## <int> <chr> <int>
## 1 1 G 623
## 2 2 F 430
## 3 3 F 358
## 4 4 G 760
## 5 5 F 292
# For original data k-means clusters
cluster_positions_orig <- data %>%
group_by(cluster) %>%
count(Pos) %>%
arrange(cluster, desc(n)) %>%
group_by(cluster) %>%
slice(1) %>%
ungroup()
print("\nMajority positions in original data k-means clusters:")
## [1] "\nMajority positions in original data k-means clusters:"
print(cluster_positions_orig)
## # A tibble: 5 × 3
## cluster Pos n
## <int> <chr> <int>
## 1 1 G 600
## 2 2 F 262
## 3 3 F 589
## 4 4 G 310
## 5 5 G 784
# Optional: Add percentages
cluster_details <- data_tsne %>%
group_by(cluster) %>%
summarise(
majority_pos = names(which.max(table(Pos))),
majority_count = max(table(Pos)),
total_count = n(),
percentage = round(max(table(Pos)) / n() * 100, 1)
)
print("\nDetailed cluster composition (t-SNE):")
## [1] "\nDetailed cluster composition (t-SNE):"
print(cluster_details)
## # A tibble: 5 × 5
## cluster majority_pos majority_count total_count percentage
## <int> <chr> <int> <int> <dbl>
## 1 1 G 623 658 94.7
## 2 2 F 430 646 66.6
## 3 3 F 358 771 46.4
## 4 4 G 760 797 95.4
## 5 5 F 292 584 50